#############################################################################
# FigureC2.R
# Aim: to replicate Figure C2 in Atsusaka and Stevenson (2021)
# Run time: about 53 seconds
#############################################################################

rm(list=ls())
start <- Sys.time()
library(tidyverse)

# Installing cWise to get bs.est function
library(devtools)
devtools::install_github("YukiAtsusaka/cWise")
library(cWise)

#############################################################################
# (1) DATA GENERATING PROCESS
#############################################################################
i.logit <- function(XB){ exp(XB)/(1 + exp(XB))}
set.seed(12345)

N = 100000 # Entire Finite Population

beta  = c(-1.5, 0.5, 0.02)                           # Unknown parameter
theta = c(2, -0.1, -0.01)                            # Unknown parameter
female = rbinom(n=N, size=1, prob=0.5)               # 50% Male, 50% Female
age = rpois(n=N, lambda=30)                          # Age with mean 30
x = as.matrix(data.frame(rep(1, N), female, age))

XB =  x %*% beta                                     # Linear aggregator with betas
XT =  x %*% theta                                    # Linear aggregator with thetas
pi.x    = i.logit(XB)                                # Pi|Male
gamma.x = i.logit(XT)                                # Gamma|Male

p = 0.1                                              # Randomization probability
p2 = 0.15                                            # Randomization probability for the anchor question
Attentive = rbinom(n=N, size=1, prob=gamma.x)        # Attentive R or not

# CHECK
# hist(pi.x[female==1], col="firebrick4", xlim=c(0.2,0.5), xlab="Proportion w/ Sensitive Traits")
# hist(pi.x[female==0], col="navy", add=T)
# mean(pi.x[female==1])
# mean(pi.x[female==0])

# SENSITIVE QUESTION OF INTERST
StatementA = rbinom(n=N, size=1, prob=pi.x)          # Sesitive item
StatementB = rbinom(n=N, size=1, prob=p)             # Randomization item
True.res = ifelse(StatementA != StatementB, 0, 1)    # Survey answer

Obs.res = rep(NA, N)
Obs.res[Attentive==1] = True.res[Attentive==1]       # Observed answer for attentive Rs
Obs.res[Attentive==0] = rbinom(n=length(N[Attentive==0]),
                               size=1, prob=0.5)     # Observed answer, otherwise


# NON-SENSITIVE ANCHOR QUESTION
StatementC = 0                                       # Attention "c"heck item (True prevalence is 0 for everyone)
StatementD = rbinom(n=N, size=1, prob=p2)            # Anchor question
True.res.prime = ifelse(StatementC != StatementD, 0, 1) # Survey answer in Anchor Question

Obs.res.prime = rep(NA, N)
Obs.res.prime[Attentive==1] = True.res.prime[Attentive==1] # Observed answer for attentive Rs
Obs.res.prime[Attentive==0] = rbinom(n=length(N[Attentive==0]),
                                     size=1, prob=0.5)# Observed answer, otherwise

Y = Obs.res
A = Obs.res.prime

dat <- cbind(Y, female, age, A, p, p2)                          # Observed data
dat <- as_tibble(dat)
head(dat)

mean(StatementA)            # Ground Truth
mean(StatementA[female==1]) # Ground Truth
mean(StatementA[female==0]) # Ground Truth


#############################################################################
# (2) SAMPLING
#############################################################################

dat.f = dat[female==1,]
dat.m = dat[female==0,]

Nvec = c(1000, 2000, 3000, 4000, 5000) # Sample size
point = NA
point.w = NA
low = NA
low.w = NA
high = NA
high.w = NA

for(i in 1:length(Nvec)){

Nsamp = Nvec[i]

samp.f = dat.f[sample(nrow(dat.f), Nsamp*0.7), ] # Prob(Sample)=0.7
samp.m = dat.m[sample(nrow(dat.m), Nsamp*0.3), ] # Prob(Sample)=0.3
samp.com = rbind(samp.f, samp.m) # Combine
samp.com$weight = ifelse(samp.com$female==1, 1/0.7, 1/0.3)


#############################################################################
# (3) RESULTS
#############################################################################

est.w <- bc.est(Y=Y, A=A, weight=weight, p=0.1, p.prime=0.15, data=samp.com)
est   <- bc.est(Y=Y, A=A, p=0.1, p.prime=0.15, data=samp.com)


point.w[i] = est.w$Results[2,1]
low.w[i] = est.w$Results[2,3]
high.w[i] = est.w$Results[2,4]

point[i] = est$Results[2,1]
low[i] = est$Results[2,3]
high[i] = est$Results[2,4]

}


pdf("FigureC2.pdf", width=8, height=4.5)
par(mfrow=c(1,2))
# NO WEIGHTING
plot(point ~ Nvec, ylim=c(0.2,0.6), type="n",
     ylab="Proportion w/ Sensitive Traits",
     xlab="Sample Size", cex.lab=1.5)
abline(h=mean(StatementA), lty=1,col="firebrick4", lwd=4)

lines(point ~ Nvec, lwd=4, col=alpha("navy",1), type="b")
lines(high ~ Nvec, lty=2, lwd=2, col=alpha("navy",1), type="b")
lines(low ~ Nvec, lty=2, lwd=2, col=alpha("navy",1), type="b")
legend("topleft", legend=c("Unweighted Estimate", "Ground Truth"),             # Add legend
       col=c("navy", "firebrick4"), lty=c(1,1), lwd=c(4,4), cex=1.2)

# WITH WEIGHTING
plot(point.w ~ Nvec, ylim=c(0.2,0.6), type="n",
     ylab="Proportion w/ Sensitive Traits",
     xlab="Sample Size", cex.lab=1.5)
abline(h=mean(StatementA), lty=1,col="firebrick4", lwd=4)

lines(point.w ~ Nvec, lwd=4, type="b")
lines(high.w ~ Nvec, lty=2, lwd=2, col="black", type="b")
lines(low.w ~ Nvec, lty=2, lwd=2, col="black", type="b")
legend("topleft", legend=c("Weighted Estimate", "Ground Truth"),             # Add legend
       col=c("black", "firebrick4"), lty=c(1,1), lwd=c(4,4), cex=1.2)

dev.off()

end <- Sys.time()
start - end

#############################################################################
# END OF THIS R SOURCE FILE
#############################################################################
